{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "docA = \"The cat sat on my bed\"\n",
    "docB = \"The dog sat on my knees\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['The', 'cat', 'sat', 'on', 'my', 'bed']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bowA = docA.split(\" \")\n",
    "bowB = docB.split(\" \")\n",
    "bowA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'The', 'bed', 'cat', 'dog', 'knees', 'my', 'on', 'sat'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 构建词库\n",
    "wordSet = set(bowA).union(bowB)\n",
    "wordSet"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 进行次数统计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>The</th>\n",
       "      <th>bed</th>\n",
       "      <th>cat</th>\n",
       "      <th>dog</th>\n",
       "      <th>knees</th>\n",
       "      <th>my</th>\n",
       "      <th>on</th>\n",
       "      <th>sat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>The</th>\n",
       "      <th>bed</th>\n",
       "      <th>cat</th>\n",
       "      <th>dog</th>\n",
       "      <th>knees</th>\n",
       "      <th>my</th>\n",
       "      <th>on</th>\n",
       "      <th>sat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 用统计字典来保存词出现的次数\n",
    "wordDictA = dict.fromkeys(wordSet, 0)\n",
    "wordDictB = dict.fromkeys(wordSet, 0)\n",
    "# 遍历文档，统计词数\n",
    "for word in bowA:\n",
    "    wordDictA[word] += 1\n",
    "for word in bowB:\n",
    "    wordDictB[word] += 1\n",
    "pd.DataFrame([wordDictA, wordDictB])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 计算词频TF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def computeTF(wordDict, bow):\n",
    "    # 用一个字典对象记录tf，把所有的词对应在bow文档里的tf都算出来\n",
    "    tfDict = {}\n",
    "    nbowCount = len(bow)\n",
    "    for word, count in wordDict.items():\n",
    "        tfDict[word] = count/nbowCount\n",
    "    return tfDict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'knees': 0.0, 'on': 0.16666666666666666, 'my': 0.16666666666666666, 'bed': 0.16666666666666666, 'dog': 0.0, 'cat': 0.16666666666666666, 'sat': 0.16666666666666666, 'The': 0.16666666666666666}\n{'knees': 0.16666666666666666, 'on': 0.16666666666666666, 'my': 0.16666666666666666, 'bed': 0.0, 'dog': 0.16666666666666666, 'cat': 0.0, 'sat': 0.16666666666666666, 'The': 0.16666666666666666}\n"
     ]
    }
   ],
   "source": [
    "tfA = computeTF(wordDictA, bowA)\n",
    "print(tfA)\n",
    "tfB = computeTF(wordDictB, bowB)\n",
    "print(tfB)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 计算逆文档频率IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'The': 0.0,\n 'bed': 0.17609125905568124,\n 'cat': 0.17609125905568124,\n 'dog': 0.17609125905568124,\n 'knees': 0.17609125905568124,\n 'my': 0.0,\n 'on': 0.0,\n 'sat': 0.0}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def computeIDF(wordDictList):\n",
    "    # 用一个字典对象保存IDF结果，每个词作为key，初始值为0\n",
    "    idfDict = dict.fromkeys(wordDictList[0], 0)\n",
    "    # 总文档数量\n",
    "    N = len(wordDictList)\n",
    "    import math\n",
    "    for wordDict in wordDictList:\n",
    "        # 遍历字典中的每个词汇，统计Ni\n",
    "        for word, count in wordDict.items():\n",
    "            if count > 0:\n",
    "                # 先把Ni增加1，加入到idfDict\n",
    "                idfDict[word] += 1\n",
    "\n",
    "    # 已经得等所有词汇i对应的Ni，现在根据公式把他替换成为IDF值\n",
    "    for word, ni in idfDict.items():\n",
    "        idfDict[word] = math.log10((N + 1) / (ni + 1))\n",
    "        \n",
    "    return idfDict\n",
    "idfs = computeIDF([wordDictA, wordDictB])\n",
    "idfs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 计算TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>The</th>\n",
       "      <th>bed</th>\n",
       "      <th>cat</th>\n",
       "      <th>dog</th>\n",
       "      <th>knees</th>\n",
       "      <th>my</th>\n",
       "      <th>on</th>\n",
       "      <th>sat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>The</th>\n",
       "      <th>bed</th>\n",
       "      <th>cat</th>\n",
       "      <th>dog</th>\n",
       "      <th>knees</th>\n",
       "      <th>my</th>\n",
       "      <th>on</th>\n",
       "      <th>sat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>0.029349</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def computeTFIDF(tf, idfs):\n",
    "    tfidf = {}\n",
    "    for word, tfVal in tf.items():\n",
    "        tfidf[word] = tfVal * idfs[word]\n",
    "    return tfidf\n",
    "tfidfA = computeTFIDF(tfA, idfs)\n",
    "tfidfB = computeTFIDF(tfB, idfs)\n",
    "pd.DataFrame([tfidfA, tfidfB])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"TF-IDF\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
